2019-07-02 16:39:11

Student number 134323

3 week Extension granted (email correspondence 12th June 2019)

# Example data
library(microbiome)
library(dplyr)
library(table1)
library(captioner)

data(dietswap)

# assure namespace issue is fixed
transform <- microbiome::transform

#devtools::install_github("hrbrmstr/hrbrthemes")
library(hrbrthemes)
library(gcookbook)
library(tidyverse)
library(phyloseq)


shuffle <- function(x, set.seed = 42){
  set.seed(set.seed)
  sample(x, length(x), replace = FALSE)
}

#pal <- shuffle(scico::scico(n = 17, palette ='berlin'), set.seed = 99)
pal <- shuffle(c(scico::scico(n = 11, palette ='berlin'),
         scico::scico(n = 11, palette ='hawaii')), set.seed = 99)

Background

Colon cancer is a leading cause of cancer death in North America and Europe. Despite advances in early detection widely available in North America, African Americans (AAM) have a far higher incidence of death by colon cancer (65:100,000) than rural Africans (AFR), at <5:100,000 (O’Keefe et al. 2015). Although colon cancer risk is determined by many factors, diet may explain some of the differences between AAM and AFR.

In order to investigate differences in gut microbiota potentially caused by diet between AAM and AFR, O’Keefe and colleagues collected stool samples before and after a dietary intervention in age and sex matched samples of healthy AAM and AFR participants (O’Keefe et al. 2015). A HITChip phylogenetic microarray was used to profile microbiota composition.

For this reproducible workflow we obtained data from the O’Keefe study, which was included in the microbiome R package (Lahti and Shetty, n.d.). Data were prepared loosely following the tutorial Introduction to the microbiome R package, filtering on pre-intervention and taking the first sample collected within each group in order to avoid pseudo-replication considerations. We also filtered on the most prevalent taxa to reduce the data in order to aid interpretation and reduce computation time.

# Use most prevalent taxa to speed up examples
# detection threshold >50 otus, prevalence > .4
pseq <- core(dietswap, detection = 50, prevalence = .4)

# choose sample subset, first sample collection pre-intervention
pseq2 <- subset_samples(pseq, group == "HE" & timepoint.within.group == 1)

# aggregate by phyla
pseq3 <- pseq2 %>%
          aggregate_taxa(level = "Phylum") %>%  
          microbiome::transform(transform = "compositional")
# working with phyloseq objects is not difficult but 
# for some tasks without `phyloseq` or `microbiome` methods
# a data.frame or tibble can be helpful for a novice like me

otu <- tibble::rownames_to_column(as.data.frame(t(otu_table(pseq2)))) %>%
  mutate(sample = rowname) %>%
  select(-rowname)

d <- as_tibble(sample_data(pseq2)) %>% 
  left_join(otu)

label(d$sex) <- "Gender"
label(d$bmi_group) <- "BMI category"

table_nums <- captioner::captioner(prefix = "Table")
tab.1 <- table_nums(name = "tab_1",
                    caption = "Demographics, African American (AAM) and rural African (AFR) participants")
fig_nums <- captioner::captioner(prefix = "Figure")
fig.1 <- fig_nums(name = "fig_1",
                  caption = "Abundance of phyla, samples sorted by abundance of bacteroidetes")

fig.2 <- fig_nums(name = "fig_2",
                  caption = "Abundance of microbial species in each sample, sorted by study groups")

fig.3 <- fig_nums(name = "fig_3",
                  caption = "Abundance of species averaged by BMI group")

fig.4 <- fig_nums(name = "fig_4",
                  caption = "tSNE ordination of species composition similarity in two-dimensions")

fig.5 <- fig_nums(name = "fig_5",
                  caption = "tSNE ordination of sample similarity by species composition, shaded by abundance of Prevotella melaninogenica. Shapes show nationality of participant")

fig.6 <- fig_nums(name = "fig_6",
                  caption = "Animation showing robustness of tSNE to varying perplexity. Note that similarities are preserved as clusters but scales are not")

fig.7 <- fig_nums(name = "fig_7",
                  caption = "Animated tSNE with perplexity = 9, theta = .05, 5000 iterations. Shading varies by microbial species, abundances are log-transformed and standardized by dividing by 1SD of each species")

fig.8 <- fig_nums(name = "fig_8",
                  caption = "Heatmap of standardized (but not centered) abundance of species. Samples and species sorted using classical multidimensional scaling. Sample labels coloured by nationality (AAM in red, AFR in black)")

Sample demographics

Table 1: Demographics, African American (AAM) and rural African (AFR) participants

table1(~ sex +  bmi_group | nationality, d)
AAM
(n=21)
AFR
(n=16)
Overall
(n=37)
Gender
female 9 (42.9%) 8 (50.0%) 17 (45.9%)
male 12 (57.1%) 8 (50.0%) 20 (54.1%)
BMI category
lean 3 (14.3%) 6 (37.5%) 9 (24.3%)
overweight 9 (42.9%) 4 (25.0%) 13 (35.1%)
obese 9 (42.9%) 6 (37.5%) 15 (40.5%)

Figures

theme_set(theme_bw(21))
p <- pseq3 %>%
    plot_composition(sample.sort = "Bacteroidetes", otu.sort = "abundance", alpha = 0.3) +
         # Set custom colors
        scale_fill_manual(values = pal, name = "Phylum") +
  facet_grid(~Tax, scales = "free_y") +
  theme(text = element_text(size = 8),
    axis.text.x = element_blank())


print(p)
Figure  1: Abundance of phyla, samples sorted by abundance of bacteroidetes

Figure 1: Abundance of phyla, samples sorted by abundance of bacteroidetes

Relative abundance of core taxa sorted by study group (AAM and AFR) shows differential abundance of species (figure 2). Note relative abundance of Prevotella melaninogenica (shaded in pink), which is more abundant in the AFR group.

# Limit the analysis on core taxa and specific sample group
p <- plot_composition(pseq2,
              taxonomic.level = "Genus",
                      sample.sort = "nationality",
                      x.label = "nationality") +
     guides(fill = guide_legend(ncol = 1)) +
   scale_fill_manual(values = pal, name = "Genus") +
     scale_y_percent() +
     labs(x = "Samples", y = "Relative abundance (%)",
                                   title = "Relative abundance data",
                                   subtitle = "",
                                   caption = "") + 
    # theme_ipsum(grid="Y") +
  theme(text = element_text(size = 8),
    axis.text.x = element_text(size = 6, angle = 90),
        legend.key.height = unit(2, "mm"))
print(p)  
Figure  2: Abundance of microbial species in each sample, sorted by study groups

Figure 2: Abundance of microbial species in each sample, sorted by study groups

Averaging abundance by BMI group shows a greater abundance of Prevotella melaninogenica in lean participants (figure 3), but Table 1 shows more lean rural Africans than African Americans so this pattern must be interpreted with caution. Additionally, more abundant taxa visually dominate the plots, while greater differences in relative abundance may be less obvious for less abundant taxa.

# Averaged by group
p <- plot_composition(pseq2,
                      average_by = "bmi_group", transform = "compositional") +
  scale_fill_manual(values = pal) +
  theme(text = element_text(size = 10),
        legend.key.height = unit(4, "mm"))
print(p)
Figure  3: Abundance of species averaged by BMI group

Figure 3: Abundance of species averaged by BMI group

Ordination

Ordination reduces high-dimensional data to lower-dimensional data, such that objects that are similar to each other are projected onto a lower-dimensional space ‘nearer’ to each other and objects that are dissimilar are projected further apart.

We compute a distance matrix of species based on sample similarities then reduce these distances to a two-dimensional space using t-distributed stochastic neighbour embeddings (tSNE) (Maaten and Hinton 2008). Taxa that appear in similar compositions in samples should be nearer to each other than taxa that are dissimilar (figure 4).

library(Rtsne)
library(vegan)
set.seed(42) # for reproducibility

# distance matrix (sample similarities)
ps <- microbiome::transform(pseq2, "hellinger")
dm <- vegdist(otu_table(ps), "euclidean")

m1 <- Rtsne(dm, dims = 2, perplexity = 7, theta = 0.1, max_iter = 5000, verbose = F)

proj <- m1$Y
rownames(proj) <- rownames(otu_table(ps))
proj <- tibble::rownames_to_column(as.data.frame(proj)) # so we can use rownames as labels
ggplot(proj, aes(x = V1, y = V2, label = rowname)) +
  geom_text(nudge_y = -2, size = 2.5) +
  geom_point() +
  xlim(-100, 120)
Figure  4: tSNE ordination of species composition similarity in two-dimensions

Figure 4: tSNE ordination of species composition similarity in two-dimensions

Using a similar approach, we ordinate samples on two dimensions using tSNE. This could be considered an unsupervised learning method, because tSNE doesn’t know about nationality, BMI etc, only (standardized) abundance of taxa in each sample. In figure 5 we plot the ordination using symbols for nationality and colour by abundance of a species of interest.

A <- apply(d[, c(9:25)], 2, function(x) scale(log(x)))  # standardize across taxa

set.seed(42)
m2 <- Rtsne(A, dims = 2, perplexity = 9, theta = .05, max_iter = 5000, verbose = F)

# put results in data.frame for plotting
B1 <- data.frame(d,
                 m2$Y)
ggplot(B1, aes(x = X1, y = X2,
               shape = nationality,
               colour = `Prevotella.melaninogenica.et.rel.`)) +
  geom_point(size = 3) +
  scico::scale_colour_scico(palette = "vik") +
  theme_minimal()
Figure  5: tSNE ordination of sample similarity by species composition, shaded by abundance of Prevotella melaninogenica. Shapes show nationality of participant

Figure 5: tSNE ordination of sample similarity by species composition, shaded by abundance of Prevotella melaninogenica. Shapes show nationality of participant

This realisation of the data clearly shows two clusters, one composed entirely of samples from African-Americans with low abundance of Prevotela melaninogenica and the other composed mostly of rural Africans with high abundance of Prevotela melaninogenica. I say “this realisation of the data”, because tSNE is a stochastic algorithm and a different throw of the dice, with different parameters, can produce different low-dimensional representations of the data. The risk here is that users may choose a low-dimensional representation that supports their pre-conceived ideas (in this case, there are implied hypotheses about nationality and Prevotela melaninogenica that the figure appears to support). Some [hyper-]parameter sensitivity analysis is prudent.

Parameters to tune include learning rate, max iterations, and perplexity (which we will describe in the following paragraph). Learning rate (theta in the Rtsne package) makes a trade-off between speed and accuracy, with theta = 0 being ‘pure’ tSNE and theta in (0, 1] being an approximation (in the above example theta = 0.05). The number of iterations can be assessed by confirming that the algorithm has converged (i.e that the clusters represented in low-dimensional space are stable when the number of iterations is increased by some non-trivial amount).

An additional hyper-parameter, somewhat unique to tSNE, is perplexity which aims to balance representation of global and local relationships in the data (Wattenberg, Viégas, and Johnson 2016). Figure 6 shows robustness to choice of perplexity. It is important to note that tSNE aims to preserve distance relationships between objects, such that objects that are similar cluster together in low-dimensional space and objects that are different are further apart. It does not preserve Euclidean distances in high-dimensional space, and thus the scales are meaningless (which is clear in the animation).

set.seed(42)

perplexity_iter <- function(x, p){
  Rtsne(x, dims = 2, perplexity = p, max_iter = 2000, theta = 0.05, verbose = FALSE)
}
# make a list of tSNE fits with varying perplexity
m <- lapply(c(4, 7, 9, 12), function(p) perplexity_iter(x = A, p = p))

extract_ys <- function(m){
  perplexity <- m$perplexity
  Y1 <- m$Y[,1]
  Y2 <- m$Y[,2]
  cbind(d, Y1, Y2, perplexity)
}

# put all the fits in a data frame for the animation
k <- bind_rows(lapply(m, extract_ys))
# animate to show effect of varying perplexity (similarity preserved, absolute scale not informative)
library(gganimate)
ggplot(k, aes(x = Y1, y = Y2, shape = nationality, colour = `Prevotella melaninogenica et rel.`)) +
  geom_point(size = 4) +
  scico::scale_colour_scico(palette = "vik") +
  theme_minimal() +
  transition_states(perplexity) +
  ease_aes('linear') +
  view_follow() +
  labs(title = 'Perplexity = {closest_state}')
Figure 6: Animation showing robustness of tSNE to varying perplexity. Note that similarities are preserved as clusters but scales are not

Figure 6: Animation showing robustness of tSNE to varying perplexity. Note that similarities are preserved as clusters but scales are not

How does tSNE achieve this? Stochastic Neighbour Embedding (SNE) defines the similarity of datapoints \(x_i\) and \(x_j\) as the conditional probability, \(p_{j|i}\), that \(x_i\) would ‘choose’ \(x_j\) as its neighbour if neighbour choices were made in proportion to their probability density under a Gaussian centered at \(x_i\) in high-dimensional space. A similar conditional probability, \(q_{j|i}\) is computed for low-dimensional counter-parts, \(y_i\) and \(y_j\). Using gradient descent, SNE minimises the sum of Kullback-Leibler divergences (Maaten and Hinton 2008), \[\sum_i \sum_j p_{j|i}log \frac {p_{j|i}}{q_{j|i}} \]

Shading datapoints by genera abundance (figure 7) gives a sense of the low-dimensional spatial relationships between them.

# make a data frame that colours points one column/genera at a time
foo <- function(x){
  d %>% mutate(col = scale(log(unlist(d[, x])), center = FALSE),
               Genus = x,
                    Y1 = m2$Y[, 1],
                    Y2 = m2$Y[, 2])
  
}

k <- lapply(names(d[, 9:30]), foo)
k <- dplyr::bind_rows(k)
p <- ggplot(k, aes(x = Y1, y = Y2, frame = Genus, shape = nationality, colour = col)) +
  geom_point(size = 5) +
  scico::scale_color_scico(palette = "vik", name = "Abundance log(z)") +
  theme_minimal()

plotly::ggplotly(p)

Figure 7: Animated tSNE with perplexity = 9, theta = .05, 5000 iterations. Shading varies by microbial species, abundances are log-transformed and standardized by dividing by 1SD of each species

A heatmap shows the relative abundance of species over samples (figure 8). Sorting the samples and the taxa using an ordination method (here we have used classical multidimensional scaling) can reveal multivariate patterns in the data. The code snippet below shows a general method for producing a plot like this using almost any ordination method. The microbiome package has plot methods with a few commonly used distance and ordination methods.

set.seed(42)

# sort genera using Classical Multidimensional Scaling
D <- dist(A, method = "euclidean")
m3 <- cmdscale(D, k = 1)

# sort samples using Classical Multidimensional Scaling
m4 <- cmdscale(dm, k = 1)
proj <- m4[,1]
proj <- tibble::rownames_to_column(as.data.frame(proj)) %>% arrange(proj)




# arrange samples and taxa by ordination above
B1 <- data.frame(d,
                 m3.Y = m3[,1]) %>%
  arrange(m3.Y) %>%  # arrange samples by ordination provided in m3.Y
  mutate(sample = factor(sample, sample)) %>%  # make levels the same as arranged order
  reshape2::melt(id.vars = c("subject", "sex", "sample", "nationality", "group", "timepoint", "timepoint.within.group", "bmi_group", "m3.Y")) %>%
  group_by(variable) %>%
  mutate(value = scale(value, center = FALSE)) %>%  # scale abundance for each genus
  ungroup() %>%
  mutate(variable2 = factor(gsub("\\.", " ", variable),
                            levels = gsub("\\.", " ", proj$rowname))) # arrange genera by ordination
# plot a heatmap
ggplot(B1, aes(x = variable2, y = sample,
               fill = value)) +
  geom_tile() +
  scico::scale_fill_scico(palette = "bilbao", name = "Abundance (z)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6),
        axis.text.y = element_text(size = 6, colour = B1$nationality)) +
  ylab("Sample") +
  xlab("Genus")
Figure  8: Heatmap of standardized (but not centered) abundance of species. Samples and species sorted using classical multidimensional scaling. Sample labels coloured by nationality (AAM in red, AFR in black)

Figure 8: Heatmap of standardized (but not centered) abundance of species. Samples and species sorted using classical multidimensional scaling. Sample labels coloured by nationality (AAM in red, AFR in black)

Data and source code

The R code for this reproducible workflow is stored in a public repository and can be cloned from github.com/ABindoff/kma712_ass2

References

Lahti, Leo, and Sudarshan Shetty. n.d. “Microbiome R Package.”

Maaten, Laurens van der, and Geoffrey Hinton. 2008. “Visualizing Data Using T-Sne.” Journal of Machine Learning Research 9 (Nov): 2579–2605.

O’Keefe, Stephen JD, Jia V Li, Leo Lahti, Junhai Ou, Franck Carbonero, Khaled Mohammed, Joram M Posma, et al. 2015. “Fat, Fibre and Cancer Risk in African Americans and Rural Africans.” Nature Communications 6: 6342.

Wattenberg, Martin, Fernanda Viégas, and Ian Johnson. 2016. “How to Use T-Sne Effectively.” Distill. https://doi.org/10.23915/distill.00002.

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_Australia.1252  LC_CTYPE=English_Australia.1252   
## [3] LC_MONETARY=English_Australia.1252 LC_NUMERIC=C                      
## [5] LC_TIME=English_Australia.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] knitr_1.23
## 
## loaded via a namespace (and not attached):
##  [1] xfun_0.7            tidyselect_0.2.5    purrr_0.3.2        
##  [4] reshape2_1.4.3      splines_3.6.0       lattice_0.20-38    
##  [7] rhdf5_2.28.0        colorspace_1.4-1    stats4_3.6.0       
## [10] mgcv_1.8-28         survival_2.44-1.1   rlang_0.3.4        
## [13] pillar_1.4.1        glue_1.3.1          BiocGenerics_0.30.0
## [16] foreach_1.4.4       plyr_1.8.4          stringr_1.4.0      
## [19] zlibbioc_1.30.0     Biostrings_2.52.0   munsell_0.5.0      
## [22] gtable_0.3.0        evaluate_0.14       codetools_0.2-16   
## [25] phyloseq_1.28.0     Biobase_2.44.0      permute_0.9-5      
## [28] IRanges_2.18.1      biomformat_1.12.0   parallel_3.6.0     
## [31] Rcpp_1.0.1          scales_1.0.0        vegan_2.5-5        
## [34] S4Vectors_0.22.0    jsonlite_1.6        XVector_0.24.0     
## [37] ggplot2_3.2.0       stringi_1.4.3       Rtsne_0.15         
## [40] dplyr_0.8.1         grid_3.6.0          ade4_1.7-13        
## [43] tools_3.6.0         magrittr_1.5        lazyeval_0.2.2     
## [46] tibble_2.1.3        cluster_2.1.0       tidyr_0.8.3        
## [49] crayon_1.3.4        ape_5.3             pkgconfig_2.0.2    
## [52] MASS_7.3-51.4       Matrix_1.2-17       data.table_1.12.2  
## [55] assertthat_0.2.1    rstudioapi_0.10     iterators_1.0.10   
## [58] microbiome_1.6.0    Rhdf5lib_1.6.0      R6_2.4.0           
## [61] compiler_3.6.0      multtest_2.40.0     igraph_1.2.4.1     
## [64] nlme_3.1-140